# Refit the RDA model on the full training set.
rda.model <- update(rda.model.fda, data=train)
# BUG FIX: predictions previously used the stale rda.model.fda, so the
# refitted rda.model above was never evaluated; predict with the new model.
pred.test <- predict(rda.model, test)
pred.test <- pred.test$class
# Confusion matrix: rows = true class (V58), columns = predicted class.
(err.table <- table(True=test$V58, Pred=pred.test))
Pred
True 0 1 2 3 4
0 40 5 0 0 0
1 6 13 1 0 2
2 0 2 6 4 1
3 0 0 1 9 0
4 0 0 0 1 3
# Test error rate = 1 - accuracy (accuracy = diagonal mass of the table).
n.correct <- sum(diag(err.table))
n.total <- sum(err.table)
(err.test <- 1 - n.correct / n.total)
[1] 0.2446809
We will now use the model on a dataset not yet investigated: Hungary.
# Load the Hungarian heart-disease dataset; "?" encodes missing values.
# FALSE spelled out: F is a reassignable variable, not a reserved word.
hung <- read.table("../data/processed.hungarian.data",
                   header = FALSE, sep = ",", na.strings = "?")
summary(hung)
V1 V2 V3 V4
Min. :28.00 Min. :0.0000 Min. :1.000 Min. : 92.0
1st Qu.:42.00 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:120.0
Median :49.00 Median :1.0000 Median :3.000 Median :130.0
Mean :47.83 Mean :0.7245 Mean :2.983 Mean :132.6
3rd Qu.:54.00 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:140.0
Max. :66.00 Max. :1.0000 Max. :4.000 Max. :200.0
NA's :1
V5 V6 V7
Min. : 85.0 Min. :0.00000 Min. :0.0000
1st Qu.:209.0 1st Qu.:0.00000 1st Qu.:0.0000
Median :243.0 Median :0.00000 Median :0.0000
Mean :250.8 Mean :0.06993 Mean :0.2184
3rd Qu.:282.5 3rd Qu.:0.00000 3rd Qu.:0.0000
Max. :603.0 Max. :1.00000 Max. :2.0000
NA's :23 NA's :8 NA's :1
V8 V9 V10
Min. : 82.0 Min. :0.0000 Min. :0.0000
1st Qu.:122.0 1st Qu.:0.0000 1st Qu.:0.0000
Median :140.0 Median :0.0000 Median :0.0000
Mean :139.1 Mean :0.3038 Mean :0.5861
3rd Qu.:155.0 3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :190.0 Max. :1.0000 Max. :5.0000
NA's :1 NA's :1
V11 V12 V13 V14
Min. :1.000 Min. :0 Min. :3.000 Min. :0.0000
1st Qu.:2.000 1st Qu.:0 1st Qu.:5.250 1st Qu.:0.0000
Median :2.000 Median :0 Median :6.000 Median :0.0000
Mean :1.894 Mean :0 Mean :5.643 Mean :0.3605
3rd Qu.:2.000 3rd Qu.:0 3rd Qu.:7.000 3rd Qu.:1.0000
Max. :3.000 Max. :0 Max. :7.000 Max. :1.0000
NA's :190 NA's :291 NA's :266
# Drop the columns dominated by missing values (V11/V12/V13; see summary above).
rm.var <- c("V11", "V12", "V13")
hung <- hung[, !(names(hung) %in% rm.var)]
# Recode every remaining NA with the dataset's -9 missing-value sentinel.
hung[is.na(hung)] <- -9
# Variables with few missing entries: restore NA and impute with k-NN (k = 7).
var.imputable <- c("V4", "V5", "V6", "V7", "V8", "V9")
for (nom in var.imputable) {
  col.vals <- hung[[nom]]
  col.vals[col.vals == -9] <- NA
  hung[[nom]] <- col.vals
  hung[[nom]] <- knn.imputation(hung, hung[[nom]], nom, 7)
}
# Recode the categorical variables as factors.
var.factor <- c("V2", "V3", "V6", "V7", "V9", "V14")
for (nom in var.factor) {
  hung[[nom]] <- as.factor(as.character(hung[[nom]]))
}
summary(hung)
V1 V2 V3 V4 V5
Min. :28.00 0: 81 1: 11 Min. : 27.0 Min. : 4.0
1st Qu.:42.00 1:213 2:106 1st Qu.:120.0 1st Qu.:198.0
Median :49.00 3: 54 Median :130.0 Median :237.0
Mean :47.83 4:123 Mean :132.2 Mean :236.6
3rd Qu.:54.00 3rd Qu.:140.0 3rd Qu.:277.0
Max. :66.00 Max. :200.0 Max. :603.0
V6 V7 V8 V9 V10 V14
0:266 0:235 Min. : 54.0 0:204 Min. :0.0000 0:188
1: 28 1: 53 1st Qu.:122.0 1: 89 1st Qu.:0.0000 1:106
2: 6 Median :140.0 2: 1 Median :0.0000
Mean :138.8 Mean :0.5861
3rd Qu.:155.0 3rd Qu.:1.0000
Max. :190.0 Max. :5.0000
# Indices of the non-factor (numeric) columns.
# Vectorized with vapply instead of growing a vector inside a loop.
names_num <- unname(which(!vapply(hung, is.factor, logical(1))))
hung_numeric <- hung[, names_num]
# Pairwise correlations among the numeric variables.
hung_cor <- cor(hung_numeric)
# Strongly positively correlated pairs (excluding the diagonal of 1s).
which(hung_cor > 0.5 & hung_cor < 1, arr.ind = TRUE)
row col
# Strongly negatively correlated pairs (cor < -0.5, excluding perfect -1).
which(hung_cor < -0.5 & hung_cor > -1, arr.ind = TRUE)
row col
# Q-Q plot of every numeric variable against the normal distribution.
par(mfrow = c(2, 3))
for (i in seq_along(hung_numeric)) {
  # paste0 builds one title string; c("Q-Q Plot: ", name) passed a
  # 2-element vector to main, which does not render as a single title.
  qqnorm(hung_numeric[, i],
         main = paste0("Q-Q Plot: ", names(hung_numeric)[i]))
  qqline(hung_numeric[, i], col = 2)
}
Box-Cox transformation analysis of the numeric variables:
# Box-Cox lambda profile for each numeric variable.
par(mfrow = c(2, 3))
for (i in seq_along(hung_numeric)) {
  # Shift each variable to be strictly positive, as Box-Cox requires y > 0.
  y <- hung_numeric[, i] - min(hung_numeric[, i]) + 1e-6
  boxcox(lm(y ~ 1), lambda = seq(-1, 1.5, by = 0.1),
         xlab = names(hung_numeric)[i])
}
We see variable V10 requires a logarithmic transformation.
# V10 is right-skewed; apply a log(1 + x) transform.
# log1p(x) computes log(1 + x) accurately for small x.
hung[, "V10"] <- log1p(hung[, "V10"])
qqnorm(hung[, "V10"])
qqline(hung[, "V10"], col = 2)
Separate train and test data, with a seed for reproducibility.
# Reproducible 2/3 train, 1/3 test split after shuffling the rows.
set.seed(2000)
n <- nrow(hung)
n.train <- round(2 * n / 3)
hung <- hung[sample(n), ]
train <- hung[seq_len(n.train), ]
test <- hung[(n.train + 1):n, ]
# Numeric-column indices of the training set; vectorized with vapply
# instead of growing a vector inside a loop.
names_num <- unname(which(!vapply(train, is.factor, logical(1))))
train_num <- train[, names_num]
Extract PCA features.
# PCA of the numeric training variables.
# NOTE(review): princomp defaults to the covariance matrix, so variables on
# larger scales dominate Comp.1 (88% of variance below) -- consider
# princomp(train_num, cor = TRUE) to work on the correlation matrix instead.
pca <- princomp(train_num)
screeplot(pca)
summary(pca)
Importance of components:
Comp.1 Comp.2 Comp.3
Standard deviation 86.0161788 24.47912423 17.80405417
Proportion of Variance 0.8849996 0.07167612 0.03791583
Cumulative Proportion 0.8849996 0.95667568 0.99459151
Comp.4 Comp.5
Standard deviation 6.709559731 0.4448637716
Proportion of Variance 0.005384815 0.0000236721
Cumulative Proportion 0.999976328 1.0000000000
Fp <- pca$scores                   # principal coordinates of individuals
Gs <- pca$loadings                 # standard coordinates of variables
Fs <- Fp %*% diag(1 / pca$sdev)    # standardized individual scores
Gp <- Gs %*% diag(pca$sdev) * 0.3  # variable coordinates, shrunk for plotting
# BUG FIX: as.numeric() on a factor returns level codes (1, 2), never 0,
# so col.class == 0 matched nothing and class 0 was never colored red.
# Decode the factor labels before comparing.
col.class <- as.numeric(as.character(train$V14))
col.class[col.class == 0] <- "red"
col.class[col.class == 1] <- "blue"
plot(Fs[, 1], Fs[, 2], asp = 1, col = col.class,
     xlab = "First principal component", ylab = "Second principal component")
# Variable arrows from the origin, with their names at the tips.
arrows(rep(0, dim(Gs)[1]), rep(0, dim(Gs)[1]), Gp[, 1], Gp[, 2])
text(Gp[, 1], Gp[, 2], names(train_num), col = "black")
legend("bottomright", fill = c("red", "blue"), legend = c('0', '1'))
# Fisher (linear) discriminant analysis with V14 as the response.
fda <- lda(V14 ~ ., data = train)
#plot(fda)
# Discriminant scores of the training individuals.
ld.scores <- predict(fda)$x
plot(ld.scores, col = col.class)
legend("bottomright", fill = c("red", "blue"), legend = c('0', '1'))
# Keep the first discriminant as an extra feature on both partitions.
train$LD1 <- ld.scores[, 1]
fda_test <- predict(fda, newdata = test)
test$LD1 <- fda_test$x[, 1]
Correspondence analysis.
# Multiple correspondence analysis of the categorical variables (Burt matrix).
ac <- mjca(hung[, names(hung) %in% var.factor], lambda = "Burt")
plot(ac, main = "MCA biplot of Burt matrix with data")
# Indicator-matrix MCA on the training set; reti = TRUE keeps the row
# coordinates. (TRUE spelled out: T is a reassignable variable, not reserved.)
ac_ind <- mjca(train[, names(train) %in% var.factor], lambda = "indicator",
               reti = TRUE)
plot(ac_ind$rowcoord, col = col.class)
legend("bottomright", fill = c("red", "blue"), legend = c('0', '1'))
# Row coordinates serve as derived MCA features.
mca.features <- ac_ind$rowcoord